/*
 * main.cpp
 * Classifier
 *
 * Daniel Wojcik
 *
 */

#include <math.h>

#include <cctype>
#include <cstdlib>
#include <cstring>
#include <fstream>
#include <iostream>
#include <map>
#include <queue>
#include <stack>
#include <string>

#include "algorithms.h"

/*std::map<std::string, TermStat> gTerms;
std::list<DocItem> docs;
std::map<int, ClassStat> docClasses;
std::map<unsigned int, Cluster> clusters;
unsigned int docCount;
unsigned int termCount;*/

// Shared classifier state used throughout this file: the term table (gTerms),
// the class and cluster tables (docClasses, clusters) and the document/term
// counters. It is also handed to the routines called from main (classify,
// updateWeights, characterizeClasses, cluster).
Globals global;
// Number of documents processed during this run. global.docCount can be
// larger because load() adds the count stored in the "brain" file.
unsigned int nDocs;
// Set by the -l / -L options. When true, a document's classification is copied
// from its real class instead of being predicted.
bool learning = false;
// Running sum of |predicted - real| class values; only accumulated when
// category == 0.
unsigned int error;
// rate: number of documents between periodic re-characterize/cluster/save
// passes when not learning (16 unless overridden with -r).
// cRate: documents processed since the last such pass.
unsigned int rate, cRate;
// sse: only reset to 0 in main within this file.
// misses: accumulated (possibly fractional) penalty for misclassified documents.
double sse, misses;

//Classification & Clustering parameters go here

//void stats();
//void classify(DocItem& dcmt);
//void cClassify(DocItem& dcmt);
//void characterizeClasses();
//void computeWeights();
//void updateWeights(DocItem* dcmt);
//void cluster();
//void cull();
// Writes the global term table to the file "brain".
void save();
// Writes the per-class statistics to the file "characteristics".
void saveClass();
// Restores term statistics from "brain", then calls loadClass().
void load();
// Restores per-class statistics from "characteristics" and re-runs clustering.
void loadClass();
// NOTE(review): declared here, but no definition is visible in this file.
DocItem* getDoc(int n);
// Prints a "Top 20" listing of terms from the global term table.
void top20();
//void classPrint();
//void clusterPrint();

int main (int argc, char** const argv)
{
	std::string str, str2;
	global.docCount = 0;
	global.termCount = 0;
	nDocs = 0;
	error = 0;
	misses = 0;
	sse = 0;
	rate = 16;
	cRate = 0;
	
	if (argc > 1)
	{
		int d = 2;
		bool read = false, ndb = false;
		if (strcmp(argv[1], "-n") == 0)
		{
			ndb = true;
			d++;
		}
		if (strcmp(argv[d-1], "-r") == 0)
		{
			rate = atoi(argv[d++]);
			d++;
		}
		if (strcmp(argv[d-1], "-l") == 0)
			learning = true;
		else if (strcmp(argv[d-1], "-L") == 0)
		{
			read = true;
			learning = true;
		}
		else if (strcmp(argv[d-1], "-C") == 0)
			read = true;
		else
			d = 1;
		//Check for -O outFileName, default to something?
		
		if (!ndb)
			load();
		
		while (d < argc)
		{
			std::ifstream argument(argv[d]);
			if (argument == NULL)
				break; //Error recovery goes here
			
			while (!argument.eof())
			{
				std::ifstream document;
				str2 = "";
				if (read)
				{
					argument >> str2;
					std::cout << "New file : " << str2 << "\n";
					document.open(str2.c_str());
					if (document == NULL)
					{
						continue;
						std::cout << "Fail\n";
					}
				}
				else
					document.open(argv[d]);
				
				DocItem dcmt = DocItem();
				//docClasses.clear();
			
				//Needs to update global counts manually now
				while (!document.eof())
				{
					str = "";
					document >> str;
					
					for (unsigned int i = 0; i < str.size(); i++)
					{
						if (isalpha(str[i]))
							str[i] = tolower(str[i]);
					}
					//std::cout << str << "\n";
					
					//Check for meta data
					if (str.compare("<meta") == 0)
					{
						while(1)
						{
							document >> str;
							if (category == 0 && str.compare("name='DC.date'") == 0)
							{
								std::string str1;
								document >> str1;
								str = str1.substr(9,4);
								dcmt.realClass[0] = str;//atoi(str.c_str());
								break;
							}
							else if (category == 1 && str.compare("name='DC.subject'") == 0)
							{
								std::string str1;
								document >> str1;
								str = str1.substr(9,str1.size()-11);
								dcmt.realClass[0] = str;
								break;
							}
							else if (str[str.size()-1] == '>')
								break;
						}
					}
					
					std::map<std::string, TermStat>::iterator itr;
					//itr = find(gTerms.begin(), gTerms.end(), str);
					itr = global.gTerms.find(str);
					if (itr != global.gTerms.end())
					{
						if (dcmt.getCount(itr->first) == 0)
						{
							dcmt.addTerm(itr->first);
							//Update inverted index
							itr->second.dCount++;
							itr->second.count++;
							//itr->second.invIndex.push_back(docCount);
						}
						else
						{
							dcmt.increment(str);
							itr->second.count++;
						}
					}
					else
					{
						global.gTerms[str].count = 1;
						global.gTerms[str].dCount = 1;
						//gTerms[str].invIndex.push_back(docCount);
						dcmt.addTerm(str);
					}
					global.termCount++;
				}
				
				//if (!read)
					document.close();
				global.docCount++;
				nDocs++;
				
				if (learning)
				{
					if (read)
					{
						//argument >> dcmt.classification;
						dcmt.classification[0] = dcmt.realClass[0];
					}
					else
					{
						//dcmt.classification = atoi(argv[++d]);
						dcmt.classification[0] = dcmt.realClass[0];
					}
				}
				else
					classify(dcmt, global); //Doing it this way means order of documents matters,
									//but otherwise would update weights with no class.
				updateWeights(&dcmt, global);
				cRate++;
				if (!learning && cRate >= rate) //Wait until the end to do this when learning
				{
					characterizeClasses(global);
					cluster(global);
					save();
					saveClass();
					cRate = 0;
				}
				//global.docs.push_back(dcmt);
				
				//Some variance checking, mostly only good for date stuff.
				//Need to get clustering working to automatically set up date range classes.
				if (category == 0)
				{
					unsigned int e = atoi(dcmt.classification[0].c_str());
					e-= atoi(dcmt.realClass[0].c_str());
					//unsigned int e = abs(dcmt.classification[0] - dcmt.realClass[0]);
					error+= abs(e);
				}
				if (dcmt.classification[0] != dcmt.realClass[0])
				{
					double d = classTypes;
					for (unsigned int i = 1; i < classTypes; i++)
					{
						if (dcmt.classification[i] == dcmt.realClass[0])
							break;
						else
							d--;
					}
					misses+= 1/d;
				}
			}
			argument.close();
			
			d++;
		}
	}
	else
		return 1;
	
	if (category == 0)
		std::cout << "Classification error: " << error/(float)nDocs << "\n";
	std::cout << "Classification accuracy: " << misses << " misses in " << nDocs;
	std::cout << " documents, " << (nDocs - misses)/(float)nDocs*100 << "%\n";
	characterizeClasses(global);
	//top20();
	//classPrint();
	cluster(global);
	save();
	saveClass();
	
    return 0;
}

void top20()
{
	std::cout << "Top 20\n";
	std::pair<std::string, TermStat> topTerms[20];
	unsigned int terms = 0;
	
	std::map<std::string, TermStat>::iterator itr = global.gTerms.begin();
	while (itr != global.gTerms.end())
	{
		TermStat term = itr->second;
		//if (term.idf < minIDF || term.count < minKeep*(docCount/supportScale))
		//	gTerms.erase(itr++);
		//else
		{
			if (terms < 20)
			{
				std::pair<std::string, TermStat> p;
				p.first = itr->first;
				p.second = itr->second;
				topTerms[terms] = p;
				terms++;
			}
			else
			{
				for (unsigned int i = 0; i < terms; i++)
				{
					if (itr->second.count > topTerms[i].second.count)
					{
						std::pair<std::string, TermStat> p;
						p.first = itr->first;
						p.second = itr->second;
						topTerms[i] = p;
						break;
					}
				}
			}
			itr++;
		}
	}
	
	for (unsigned int i = 0; i < 20; i++)
	{
		std::cout << topTerms[i].first << " " << topTerms[i].second.count << 
			" " << topTerms[i].second.idf << "\n";
	}
}

//Write current state of classification knowledge to a file so
//that it can be loaded later. Useful for incremental updates.
void save()
{
	std::cout << "Saving\n";
	std::ofstream docOut("brain");
	if (docOut == NULL)
		return;
	docOut<< "= " << global.docCount << "\n";
	docOut<< "? " << global.gTerms.size() << "\n";
	
	std::map<std::string, TermStat>::iterator itr = global.gTerms.begin();
	
	while (itr != global.gTerms.end())
	{
		TermStat term = itr->second;
		if (term.count >= minKeep*(global.docCount/(float)supportScale) && term.idf >= minIDF)
		{
			docOut<< "+ "<< itr->first<< " "<< term.count<< " "<< term.dCount<< " "<< term.idf<< "\n";
		
			std::map<std::string,unsigned int>::iterator wItr = term.cCounts.begin();
			while (wItr != term.cCounts.end())
			{
				docOut<< "- "<< wItr->first<< " "<< wItr->second << "\n";
				wItr++;
			}
			
			itr++;
		}
		else
		{
			global.gTerms.erase(itr++);
		}
		//itr++;
	}
	docOut.close();
}

void saveClass()
{
	std::cout << "Saving Classes\n";
	std::ofstream docOut("characteristics");
	if (docOut == NULL)
		return;
	docOut<< "= " << global.docClasses.size() << "\n";
	
	std::map<std::string,ClassStat>::iterator itr = global.docClasses.begin();
	while (itr != global.docClasses.end())
	{
		docOut<< "+ " << itr->first << " " << itr->second.termSize << " ";
		docOut<< itr->second.dCount << " " << itr->second.cluster << " ";
		docOut<< itr->second.point << "\n";
		
		std::map<std::string,TermShort>::iterator tItr = itr->second.charTerms.begin();
		while (tItr != itr->second.charTerms.end())
		{
			docOut << "- " << tItr->first << " " << tItr->second.count << " ";
			docOut << tItr->second.dCount << " " << tItr->second.idf << " ";
			docOut << tItr->second.count * tItr->second.idf << "\n";
			tItr++;
		}
		docOut << "\n";
		itr++;
	}
	
	docOut.close();
}

void load()
{
	std::ifstream inFile("brain");
	if (inFile == NULL)
		return;
	
	std::string term = "";
	unsigned int dCount, tCount;
	std::string classf;
	float idf;
	char x;
	
	while (!inFile.eof())
	{
		inFile >> x;
		if (x == '+')
		{
			inFile >> term >> tCount >> dCount >> idf;
			global.gTerms[term].count = tCount;
			global.gTerms[term].dCount = dCount;
			global.gTerms[term].idf = idf;
		}
		else if (x == '-')
		{
			inFile >> classf >> tCount;
			global.gTerms[term].cCounts[classf] = tCount;
			global.docClasses[classf].seen = true;
		}
		else if (x == '=')
		{
			inFile >> tCount;
			global.docCount+= tCount;
		}
	}
	inFile.close();
	loadClass();
	//characterizeClasses();
}

void loadClass()
{
	std::ifstream inFile("characteristics");
	if (inFile == NULL)
		return;
	
	std::string term = "";
	unsigned int count, dCount, tCount, size, cls;
	double p;
	std::string classf;
	float idf, w;
	char x;
	
	while (!inFile.eof())
	{
		inFile >> x;
		if (x == '+')
		{
			inFile >> classf >> size >> tCount >> cls >> p;
			global.docClasses[classf].termSize = size;
			global.docClasses[classf].dCount = tCount;
			global.docClasses[classf].cluster = cls;
			global.docClasses[classf].point = p;
			global.docClasses[classf].seen = true;
			
			if (global.clusters.count(cls) == 0)
			{
				double np = global.clusters[cls].meanPoint * global.clusters[cls].count + p;
				global.clusters[cls].count++;
				global.clusters[cls].meanPoint = np/global.clusters[cls].count;
			}
			else
			{
				global.clusters[cls].count = 1;
				global.clusters[cls].meanPoint = p;
			}
		}
		else if (x == '-')
		{
			inFile >> term >> count >> dCount >> idf >> w;
			global.docClasses[classf].charTerms[term].count = count;
			global.docClasses[classf].charTerms[term].dCount = dCount;
			global.docClasses[classf].charTerms[term].idf = idf;
		}
	}
	inFile.close();
	
	cluster(global);
}

//End of file~
